Purpose

This document is an adaptation of “Use OpenAI text embeddings for horror movie descriptions” by Julia Silge.

The goal is to apply my R training to real-world projects, including using the OpenAI API in a data science project. The data used here is from the TidyTuesday project.

#load r packages 
library(tidytuesdayR)
library(tidymodels)
library(httr)

Data Exploration

# Download the 2022-11-01 TidyTuesday release from GitHub with tidytuesdayR,
# then pull the horror movies table out of the downloaded bundle.
tuesdata <- tt_load(x = "2022-11-01")
horror_movies <- tuesdata$horror_movies
# Build the working sample: keep English-language movies that have an
# overview, then randomly draw 1,000 rows with slice_sample().
# The seed is fixed so the same rows are drawn on every run.
set.seed(123)

horror_movies_df <- horror_movies %>%
  filter(original_language == "en") %>%
  filter(!is.na(overview)) %>%
  slice_sample(n = 1000)

horror_movies_df %>%
  glimpse()
## Rows: 1,000
## Columns: 20
## $ id                <dbl> 751453, 753328, 696605, 46020, 217787, 698676, 14229…
## $ original_title    <chr> "Sushi Night", "Spout", "What Josiah Saw", "Sharktop…
## $ title             <chr> "Sushi Night", "Spout", "What Josiah Saw", "Sharktop…
## $ original_language <chr> "en", "en", "en", "en", "en", "en", "en", "en", "en"…
## $ overview          <chr> "After having a dinner date, a man realizes his love…
## $ tagline           <chr> NA, NA, "You do what need be done then.", "Half-shar…
## $ release_date      <date> 2020-10-08, 2009-11-21, 2021-08-13, 2010-09-25, 201…
## $ poster_path       <chr> "/s43doT1jZ1yrTibqddL4l2ekHaJ.jpg", "/1WXajyutGGPlms…
## $ popularity        <dbl> 0.600, 0.600, 5.622, 8.925, 4.859, 0.871, 3.221, 2.0…
## $ vote_count        <dbl> 0, 0, 23, 138, 46, 0, 47, 6, 4, 45, 0, 0, 1, 5, 4, 7…
## $ vote_average      <dbl> 0.0, 0.0, 6.0, 4.5, 3.9, 0.0, 5.1, 4.5, 7.9, 4.3, 0.…
## $ budget            <dbl> 0, 0, 0, 0, 3600000, 0, 0, 0, 0, 0, 0, 0, 0, 4000, 0…
## $ revenue           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30000, …
## $ runtime           <dbl> 9, 17, 120, 89, 89, 84, 93, 74, 112, 88, 9, 75, 85, …
## $ status            <chr> "Released", "Released", "Released", "Released", "Rel…
## $ adult             <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
## $ backdrop_path     <chr> NA, NA, "/d3rvdCFRHydPhb9bxnMBUFMEA9I.jpg", "/lHVxlW…
## $ genre_names       <chr> "Horror, Thriller", "Drama, Horror", "Horror, Thrill…
## $ collection        <dbl> NA, NA, NA, 370374, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ collection_name   <chr> NA, NA, NA, "Sharktopus Collection", NA, NA, NA, NA,…
# Print three randomly chosen movie overviews to get a feel for the text.
horror_movies_df$overview %>%
  sample(size = 3)
## [1] "Victor Reynolds arrives at the notorious House of Usher, whereupon he is greeted by old acquaintances Roderick and Madeline Usher and their servant, Markus. As Victor uncovers more about the history of the house and the disappearances of those that entered it previously, he begins to realize that he is in mortal danger."
## [2] "After having a feud with director Kenneth J. Hall, producer Fred Olen Ray hired Ted Newsom to shoot brand new footage (on video) to weave in with scenes from Hall's film Evil Spawn (1987)."                                                                                                                                     
## [3] "A 16mm psychodrama about a young woman who, obsessed with transcribing her thoughts to a myriad of post-it notes, finds herself struggling to escape a surreal anxiety attack."

Set the OpenAI API key as an environment variable (OPENAI_API_KEY) using the Sys.setenv() function

# Request text embeddings from the OpenAI API.
# Text embeddings are representations of text learned from large datasets.

embeddings_url <- "https://api.openai.com/v1/embeddings"

# Fail fast when the key is missing: Sys.getenv() returns "" for an unset
# variable, which would otherwise be sent as an empty bearer token and only
# be rejected by the server with a 401.
if (!nzchar(Sys.getenv("OPENAI_API_KEY"))) {
  stop(
    "OPENAI_API_KEY is not set; set it with Sys.setenv() before running.",
    call. = FALSE
  )
}

auth <- add_headers(
  Authorization = paste("Bearer", Sys.getenv("OPENAI_API_KEY"))
)

# All sampled overviews are sent as the `input` of a single request.
body <- list(
  model = "text-embedding-ada-002",
  input = horror_movies_df$overview
)

# POST the request; `encode = "json"` serializes `body` as JSON.
resp <- POST(
  embeddings_url,
  auth,
  body = body,
  encode = "json"
)

Check resp$status_code to confirm API call

# Inspect the HTTP status of the embeddings request:
#   200 = success
#   401 = lack of valid auth credentials
status_code(resp)
## [1] 200
# Read the response body as one UTF-8 text chunk, then parse the JSON into an
# R list (flatten = TRUE flattens nested data frames while parsing).
embeddings <- resp %>%
  content(as = "text", encoding = "UTF-8") %>%
  jsonlite::fromJSON(flatten = TRUE)

# Pull the embedding vectors out of the parsed response
# (the "embedding" element nested inside "data").
embed_extract <- pluck(embeddings, "data", "embedding")

# Attach the extracted embeddings to the movie data as a new list-column.
horror_embed <- mutate(horror_movies_df, embeddings = embed_extract)

# Preview the identifying columns next to the new embeddings column.
select(horror_embed, id, original_title, embeddings)
## # A tibble: 1,000 × 3
##        id original_title              embeddings   
##     <dbl> <chr>                       <list>       
##  1 751453 Sushi Night                 <dbl [1,536]>
##  2 753328 Spout                       <dbl [1,536]>
##  3 696605 What Josiah Saw             <dbl [1,536]>
##  4  46020 Sharktopus                  <dbl [1,536]>
##  5 217787 Paranormal Whacktivity      <dbl [1,536]>
##  6 698676 Dark Web: Mystery Box       <dbl [1,536]>
##  7  14229 Ti piace Hitchcock?         <dbl [1,536]>
##  8 364094 Fun Size Horror: Volume Two <dbl [1,536]>
##  9 476484 Before I Die                <dbl [1,536]>
## 10 407626 Ozark Sharks                <dbl [1,536]>
## # … with 990 more rows

Create a matrix where every row is a movie and every column is one dimension of the OpenAI embedding

# Stack the per-movie embedding vectors into a numeric matrix with one row
# per movie. The shape is derived from the number of movies instead of a
# hard-coded embedding width, so a model with a different dimension still
# yields one row per movie rather than a silently mis-shaped matrix.
embeddings_mat <- matrix(
  unlist(horror_embed$embeddings),
  nrow = nrow(horror_embed),
  byrow = TRUE
)

Similarity

# Compute the cosine similarity matrix between all pairs of movies.
# Step 1: scale every row (movie) to unit length.
embeddings_sim <- embeddings_mat /
  sqrt(rowSums(embeddings_mat * embeddings_mat))
# Step 2: multiply the normalized matrix by its OWN transpose. Both sides of
# the product must be normalized for the result to be a cosine similarity;
# multiplying by the transpose of the raw matrix normalizes one side only.
embeddings_sim <- tcrossprod(embeddings_sim)
dim(embeddings_sim)
## [1] 1000 1000
# Look at the title and overview of the movie in row 4 of the sample.
horror_movies_df %>%
  select(title, overview) %>%
  slice(4)
## # A tibble: 1 × 2
##   title      overview                                                           
##   <chr>      <chr>                                                              
## 1 Sharktopus "The U.S. Navy's special group \"Blue Water\" builds a half-shark,…
# Check out which movies are most similar to movie 4: turn its row of the
# similarity matrix into a tibble and sort from most to least similar.
embeddings_sim[4, ] %>%
  enframe(name = "movie", value = "similarity") %>%
  arrange(desc(similarity))
## # A tibble: 1,000 × 2
##    movie similarity
##    <int>      <dbl>
##  1     4      1.00 
##  2   935      0.857
##  3   379      0.849
##  4   380      0.847
##  5   533      0.841
##  6   898      0.840
##  7   605      0.837
##  8   914      0.826
##  9   745      0.825
## 10   849      0.825
## # … with 990 more rows
# Show the title and overview of the three movies ranked closest to movie 4.
horror_movies_df %>%
  select(title, overview) %>%
  slice(c(935, 379, 380))
## # A tibble: 3 × 2
##   title                        overview                                         
##   <chr>                        <chr>                                            
## 1 Octaman                      "A scientific team in Mexico discover a pool of …
## 2 Dark Waters                  "Moneyless, ocean-exploring gigolo and his world…
## 3 Mega Shark vs. Giant Octopus "The California coast is terrorized by two enorm…

PCA

PC1 explains the most “variation” in the text

# Identify the first 32 principal components of the embeddings.
# prcomp_irlba() is used for faster computation.

set.seed(234)
horror_pca <- irlba::prcomp_irlba(embeddings_mat, n = 32)

# Put the component scores side by side with the original movie columns.
augmented_pca <- bind_cols(as_tibble(horror_pca$x), horror_movies_df)

PC Plots

Plot principal components

# Scatter plot of PC1 vs PC2, with points colored by vote average.
ggplot(augmented_pca, aes(PC1, PC2, color = vote_average)) +
  geom_point(size = 1.2, alpha = 0.8) +
  scale_color_viridis_c()

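We observe no clear relationship between vote average and the first two principal components of the text embeddings. This suggests that a movie's description is not strongly related to its rating.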

#make plot interactive
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:httr':
## 
##     config
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Rebuild the PC1 vs PC2 scatter plot (colored by vote average) and assign it
# to `p`. The `text` aesthetic is mapped to the movie title so that
# ggplotly(tooltip = "text") has something to display; without a `text`
# mapping the hover tooltips would be empty.
p <- augmented_pca %>%
  ggplot(aes(x = PC1, y = PC2, color = vote_average, text = title)) +
  geom_point(size = 1.2, alpha = 0.8) +
  scale_color_viridis_c()

# Convert to an interactive plot; hovering over a point shows its title.
ggplotly(p, tooltip = "text")